Corona Virus Analysis

By Anubhav Sharma

Importing datasets

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
In [2]:
df_confirmed=pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
df_confirmed.head()
Out[2]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 6/18/20 6/19/20 6/20/20 6/21/20 6/22/20 6/23/20 6/24/20 6/25/20 6/26/20 6/27/20
0 NaN Afghanistan 33.0000 65.0000 0 0 0 0 0 0 ... 27532 27878 28424 28833 29157 29481 29640 30175 30451 30616
1 NaN Albania 41.1533 20.1683 0 0 0 0 0 0 ... 1788 1838 1891 1962 1995 2047 2114 2192 2269 2330
2 NaN Algeria 28.0339 1.6596 0 0 0 0 0 0 ... 11385 11504 11631 11771 11920 12076 12248 12445 12685 12968
3 NaN Andorra 42.5063 1.5218 0 0 0 0 0 0 ... 855 855 855 855 855 855 855 855 855 855
4 NaN Angola -11.2027 17.8739 0 0 0 0 0 0 ... 166 172 176 183 186 189 197 212 212 259

5 rows × 162 columns

In [3]:
df_recovered=pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
df_recovered.head()
Out[3]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 6/18/20 6/19/20 6/20/20 6/21/20 6/22/20 6/23/20 6/24/20 6/25/20 6/26/20 6/27/20
0 NaN Afghanistan 33.0000 65.0000 0 0 0 0 0 0 ... 7660 7962 8292 8764 8841 9260 9869 10174 10306 10674
1 NaN Albania 41.1533 20.1683 0 0 0 0 0 0 ... 1086 1114 1126 1134 1159 1195 1217 1250 1298 1346
2 NaN Algeria 28.0339 1.6596 0 0 0 0 0 0 ... 8078 8196 8324 8422 8559 8674 8792 8920 9066 9202
3 NaN Andorra 42.5063 1.5218 0 0 0 0 0 0 ... 792 792 792 792 796 797 797 797 799 799
4 NaN Angola -11.2027 17.8739 0 0 0 0 0 0 ... 64 66 66 77 77 77 77 81 81 81

5 rows × 162 columns

In [4]:
df_death=pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
df_death.head()
Out[4]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 6/18/20 6/19/20 6/20/20 6/21/20 6/22/20 6/23/20 6/24/20 6/25/20 6/26/20 6/27/20
0 NaN Afghanistan 33.0000 65.0000 0 0 0 0 0 0 ... 546 548 569 581 598 618 639 675 683 703
1 NaN Albania 41.1533 20.1683 0 0 0 0 0 0 ... 39 42 43 44 44 45 47 49 51 53
2 NaN Algeria 28.0339 1.6596 0 0 0 0 0 0 ... 811 825 837 845 852 861 869 878 885 892
3 NaN Andorra 42.5063 1.5218 0 0 0 0 0 0 ... 52 52 52 52 52 52 52 52 52 52
4 NaN Angola -11.2027 17.8739 0 0 0 0 0 0 ... 8 8 9 9 10 10 10 10 10 10

5 rows × 162 columns

In [5]:
df_cases=pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv')
df_cases.head()
Out[5]:
Country_Region Last_Update Lat Long_ Confirmed Deaths Recovered Active Incident_Rate People_Tested People_Hospitalized Mortality_Rate UID ISO3
0 Australia 2020-06-28 09:33:51 -25.0000 133.0000 7686.0 104.0 6993.0 589.0 30.188887 NaN NaN 1.353110 36 AUS
1 Austria 2020-06-28 09:33:51 47.5162 14.5501 17654.0 702.0 16401.0 551.0 196.016166 NaN NaN 3.976436 40 AUT
2 Canada 2020-06-28 09:33:51 60.0010 -95.0010 104878.0 8576.0 67445.0 28858.0 277.046771 NaN NaN 8.177120 124 CAN
3 China 2020-06-28 09:33:51 30.5928 114.3055 84743.0 4641.0 79591.0 511.0 6.032920 NaN NaN 5.476559 156 CHN
4 Denmark 2020-06-28 09:33:51 56.2639 9.5018 12875.0 604.0 11708.0 563.0 222.281574 NaN NaN 4.691262 208 DNK
In [6]:
print(df_confirmed.shape)
print(df_death.shape)
print(df_recovered.shape)
print(df_cases.shape)
(266, 162)
(266, 162)
(253, 162)
(188, 14)

Data Preprocessing

In [7]:
#All the first 3 datasets have same number of columns
df_confirmed.columns
Out[7]:
Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '6/18/20', '6/19/20', '6/20/20', '6/21/20', '6/22/20', '6/23/20',
       '6/24/20', '6/25/20', '6/26/20', '6/27/20'],
      dtype='object', length=162)
In [8]:
df_cases.isnull().sum()
Out[8]:
Country_Region           0
Last_Update              0
Lat                      2
Long_                    2
Confirmed                0
Deaths                   0
Recovered                2
Active                   0
Incident_Rate            2
People_Tested          188
People_Hospitalized    188
Mortality_Rate           0
UID                      0
ISO3                     2
dtype: int64
In [9]:
#let's see where we stand on Global level
df_global=df_cases.copy().drop({'Last_Update','Lat','Long_','Country_Region'},axis=1)
global_summary=pd.DataFrame(df_global.sum())
In [10]:
global_summary=global_summary.transpose()
global_summary
Out[10]:
Confirmed Deaths Recovered Active Incident_Rate People_Tested People_Hospitalized Mortality_Rate UID
0 10001527.0 499124.0 5065869.0 4358385.0 36272.237492 0.0 0.0 624.619647 97529.0
In [11]:
global_summary=global_summary.drop({'People_Tested','People_Hospitalized','Incident_Rate','UID'},axis=1)
global_summary.style.format("{:,.0f}")
Out[11]:
Confirmed Deaths Recovered Active Mortality_Rate
0 10,001,527 499,124 5,065,869 4,358,385 625
In [12]:
#Now we will visualize Covid 19 Clusters in the world
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors


import folium
In [13]:
print(df_cases['Lat'].isnull().sum())
print(df_cases['Long_'].isnull().sum())
2
2
In [14]:
df_cases[df_cases['Lat'].isnull()]
Out[14]:
Country_Region Last_Update Lat Long_ Confirmed Deaths Recovered Active Incident_Rate People_Tested People_Hospitalized Mortality_Rate UID ISO3
68 Diamond Princess 2020-06-28 09:33:51 NaN NaN 712.0 13.0 651.0 48.0 NaN NaN NaN 1.825843 9999 NaN
116 MS Zaandam 2020-06-28 09:33:51 NaN NaN 9.0 2.0 NaN 7.0 NaN NaN NaN 22.222222 8888 NaN
In [15]:
df_cases=df_cases.drop(df_cases.index[[68,116]])
df_cases['Lat'].isnull().sum()
Out[15]:
0

Visualizing covid 19 across the world

Most number of cases

We can see that most number of cases are in North and South America followed by India

In [16]:
#total confirmed cases around the world
world_map_confirmed=folium.Map(location=[20,0],tiles='cartodbdark_matter',zoom_start=2)
for lat,lng,confirmed in zip(df_cases['Lat'],df_cases['Long_'],df_cases['Confirmed']):
    label='{}'.format(confirmed)
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat,lng],
                       radius=confirmed/60000,
                       popup=label,
                        color='crimson',
                        fill_color='#DC143C ',
                        fill=True,
                        fill_opacity=0.7,
                        parse_html=False).add_to(world_map_confirmed)

world_map_confirmed
Out[16]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Most Number of recoveries

Here we can see many countries with good recovry number but most number of recoveries are in USA

In [17]:
#Total Recovered Cases 
world_map_recovered=folium.Map(location=[20,0],tiles='cartodbdark_matter',zoom_start=2)
for lat,lng,recovered in zip(df_cases['Lat'],df_cases['Long_'],df_cases['Recovered']):
    label='{}'.format(recovered)
    label=folium.Popup(label,parse_html=True)
    folium.CircleMarker([lat,lng],
                       radius=recovered/50000,
                       popup=label,
                       color='#32CD32',
                       fill_color='#32CD32',
                       fill=True,
                       fill_opacity=0.7,
                       parse_html=False).add_to(world_map_recovered)
world_map_recovered
Out[17]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Most number of deaths

so here we can see an interesting visual which tells us that some countries who are not even in top 5 countries having most number of cases have signiificant amount of deaths. India on the other hand have managed to control death rate.

In [18]:
#total number of deaths
world_map_death=folium.Map(location=[20,0],tiles='cartodbdark_matter',zoom_start=2)
for lat,lng,death in zip(df_cases['Lat'],df_cases['Long_'],df_cases['Deaths']):
    label='{}'.format(death)
    label=folium.Popup(label,parse_html=True)
    if(death==0):
        pass
    else:
        folium.CircleMarker([lat,lng],
                       
                       radius=death/5000,
                       popup=label,
                       color='red',
                       fill_color='red',
                       fill=True,
                       fill_opacity=0.7,
                       parse_html=False).add_to(world_map_death)
world_map_death
Out[18]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [19]:
ts_confirmed=pd.DataFrame(df_confirmed.sum()).transpose()
ts_confirmed
Out[19]:
Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 1/29/20 ... 6/18/20 6/19/20 6/20/20 6/21/20 6/22/20 6/23/20 6/24/20 6/25/20 6/26/20 6/27/20
0 5625.379563 5995.278262 555.0 654.0 941.0 1434.0 2118.0 2927.0 5578.0 6166.0 ... 8488976.0 8670323.0 8829186.0 8960607.0 9098643.0 9263935.0 9431350.0 9609829.0 9801572.0 9979535.0

1 rows × 160 columns

Visualizing daily increase , recoveries and deaths of covid 19 around the world

Here we will see that how have we done as a world in this pandemic

In [20]:
ts_confirmed=ts_confirmed.drop({'Lat','Long'},axis=1)
In [21]:
ts_confirmed=ts_confirmed.transpose()
ts_confirmed.head()
Out[21]:
0
1/22/20 555.0
1/23/20 654.0
1/24/20 941.0
1/25/20 1434.0
1/26/20 2118.0
In [22]:
ts_confirmed=ts_confirmed.reset_index()
In [23]:
ts_confirmed.head()
Out[23]:
index 0
0 1/22/20 555.0
1 1/23/20 654.0
2 1/24/20 941.0
3 1/25/20 1434.0
4 1/26/20 2118.0
In [24]:
ts_confirmed=ts_confirmed.rename(columns={'index':'dates',0:'num_cases'})
ts_confirmed.head()
Out[24]:
dates num_cases
0 1/22/20 555.0
1 1/23/20 654.0
2 1/24/20 941.0
3 1/25/20 1434.0
4 1/26/20 2118.0
In [25]:
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
pyo.init_notebook_mode()
In [26]:
ts_recovered=pd.DataFrame(df_recovered.sum())
ts_recovered.head()
Out[26]:
0
Lat 5009.749300
Long 7126.560346
1/22/20 28.000000
1/23/20 30.000000
1/24/20 36.000000
In [27]:
ts_recovered=ts_recovered.drop({'Lat','Long'},axis=0)
ts_recovered=ts_recovered.reset_index()
ts_recovered=ts_recovered.rename(columns={'index':'dates',0:'recovered'})
ts_recovered.head()
Out[27]:
dates recovered
0 1/22/20 28.0
1 1/23/20 30.0
2 1/24/20 36.0
3 1/25/20 39.0
4 1/26/20 52.0
In [28]:
ts_death=pd.DataFrame(df_death.sum())
ts_death.head()
Out[28]:
0
Lat 5625.379563
Long 5995.278262
1/22/20 17.000000
1/23/20 18.000000
1/24/20 26.000000
In [29]:
ts_death=ts_death.drop({'Lat','Long'},axis=0)
ts_death=ts_death.reset_index()
ts_death=ts_death.rename(columns={'index':'dates',0:'deaths'})
ts_death.head()
Out[29]:
dates deaths
0 1/22/20 17.0
1 1/23/20 18.0
2 1/24/20 26.0
3 1/25/20 42.0
4 1/26/20 56.0
In [30]:
# Now we will create a dataframe for active
#series converts normal array into columns
ts_active=pd.Series(
    data=np.array(
    [x1-x2-x3 for (x1,x2,x3) in zip(ts_confirmed['num_cases'],ts_death['deaths'],ts_recovered['recovered'])]),index=ts_confirmed['dates'])
In [31]:
ts_active=ts_active.reset_index()
ts_active.head()
Out[31]:
dates 0
0 1/22/20 510.0
1 1/23/20 606.0
2 1/24/20 879.0
3 1/25/20 1353.0
4 1/26/20 2010.0
In [32]:
ts_active=ts_active.rename(columns={0:'active'})
ts_active.head()
Out[32]:
dates active
0 1/22/20 510.0
1 1/23/20 606.0
2 1/24/20 879.0
3 1/25/20 1353.0
4 1/26/20 2010.0
In [33]:
# Now we will visualize number of cases,active cases,total recoveries and total deaths
trace0=go.Scatter(
        x=ts_confirmed['dates'],
        y=ts_confirmed['num_cases'],
        mode='lines',
        name='Total num of cases')

trace1=go.Scatter(
        x=ts_recovered['dates'],
        y=ts_recovered['recovered'],
        mode='lines',
        name='Total recoveries')

trace2=go.Scatter(
        x=ts_death['dates'],
        y=ts_death['deaths'],
        mode='lines',
        name='Total deaths')

trace3=go.Scatter(
        x=ts_active['dates'],
        y=ts_active['active'],
        mode='lines',
        name='Total active cases')
In [34]:
data=[trace0,trace1,trace2,trace3]
layout=go.Layout(title='Corona Virus scenario')
fig=go.Figure(data,layout)
fig.show()
In [35]:
fig = px.bar(ts_confirmed, x="dates", y='num_cases', title="daily confirmed cases")
fig.show()

Top 20 countrues with most number of cases

In [36]:
#Now we will see top 20 countries
df_top20=df_cases.drop({'Last_Update','Lat','Long_','People_Hospitalized','Mortality_Rate','UID','ISO3'},axis=1)
In [37]:
df_top20.sort_values(by='Confirmed',axis=0,ascending=False,inplace=True)
In [38]:
df_top20=df_top20.head(20)
In [39]:
df_top20.reset_index(drop=True,inplace=True)
In [40]:
df_top20=df_top20.drop({'People_Tested'},axis=1)
In [41]:
df_top20.style.bar(align='left',subset=['Confirmed', 'Deaths','Recovered','Active','Incident_Rate'], color='Red')
Out[41]:
Country_Region Confirmed Deaths Recovered Active Incident_Rate
0 US 2.51032e+06 125539 679308 1.62732e+06 761.936
1 Brazil 1.31367e+06 57070 727715 528882 618.023
2 Russia 633542 9060 398311 226171 434.128
3 India 528859 16095 309713 203051 38.323
4 United Kingdom 311727 43598 1364 266765 459.192
5 Peru 275989 9135 164024 102830 837.044
6 Chile 267766 5347 228055 34364 1400.73
7 Spain 248469 28341 150376 69753 531.43
8 Italy 240136 34716 188584 16836 397.17
9 Iran 220180 10364 180661 29155 262.141
10 Mexico 212802 26381 160721 25700 166.522
11 Pakistan 202955 4118 92624 106213 91.8796
12 France 199473 29781 75773 93919 305.596
13 Turkey 195883 5082 169182 21619 232.257
14 Germany 194693 8968 177628 8097 232.375
15 Saudi Arabia 178504 1511 122128 54865 512.738
16 Bangladesh 137787 1738 55727 80322 83.6648
17 South Africa 131800 2413 67094 62293 222.227
18 Canada 104878 8576 67445 28858 277.047
19 Qatar 93663 110 77225 16328 3250.99

Analysing the situation in India

Now we will analyse the situation of coronavirus in India

In [42]:
#Now we will Analyze India
df_india=pd.DataFrame(df_top20.loc[3,:])
In [43]:
df_india=df_india.transpose()
In [44]:
df_india.reset_index(drop=True,inplace=True)
df_india
Out[44]:
Country_Region Confirmed Deaths Recovered Active Incident_Rate
0 India 528859 16095 309713 203051 38.323
In [45]:
ts_confirmed_india=df_confirmed[df_confirmed['Country/Region'] == 'India']
ts_confirmed_india=ts_confirmed_india.drop({'Province/State','Lat','Long'},axis=1).reset_index(drop=True).sum()
ts_confirmed_india=ts_confirmed_india.to_frame()
ts_confirmed_india=ts_confirmed_india.drop(ts_confirmed_india.index[0])
ts_confirmed_india=ts_confirmed_india.reset_index()
ts_confirmed_india=ts_confirmed_india.rename(columns={'index':'dates',0:'confirmed'})
ts_confirmed_india.head()
Out[45]:
dates confirmed
0 1/22/20 0
1 1/23/20 0
2 1/24/20 0
3 1/25/20 0
4 1/26/20 0
In [46]:
ts_recovered_india=df_recovered[df_recovered['Country/Region']=='India']
ts_recovered_india=ts_recovered_india.drop({'Province/State','Lat','Long'},axis=1).reset_index(drop=True).sum()
ts_recovered_india=ts_recovered_india.to_frame()
ts_recovered_india=ts_recovered_india.drop(ts_recovered_india.index[0])
ts_recovered_india=ts_recovered_india.reset_index()
ts_recovered_india=ts_recovered_india.rename(columns={'index':'dates',0:'recovered'})
ts_recovered_india.head()
Out[46]:
dates recovered
0 1/22/20 0
1 1/23/20 0
2 1/24/20 0
3 1/25/20 0
4 1/26/20 0
In [47]:
ts_death_india=df_death[df_death['Country/Region']=='India']
ts_death_india=ts_death_india.drop({'Province/State','Lat','Long'},axis=1).reset_index(drop=True).sum()
ts_death_india=ts_death_india.to_frame()
ts_death_india=ts_death_india.drop(ts_death_india.index[0])
ts_death_india=ts_death_india.reset_index()
ts_death_india=ts_death_india.rename(columns={'index':'dates',0:'deaths'})
ts_death_india.head()
Out[47]:
dates deaths
0 1/22/20 0
1 1/23/20 0
2 1/24/20 0
3 1/25/20 0
4 1/26/20 0
In [48]:
ts_active_india=pd.Series(
    data=np.array(
    [x1-x2-x3 for (x1,x2,x3) in zip (ts_confirmed_india['confirmed'],ts_death_india['deaths'],ts_recovered_india['recovered'])]),
    index=ts_confirmed_india['dates']
)
ts_active_india=ts_active_india.to_frame()
ts_active_india=ts_active_india.reset_index()
ts_active_india=ts_active_india.rename(columns={'index':'dates',0:'active'})
ts_active_india.head()
Out[48]:
dates active
0 1/22/20 0
1 1/23/20 0
2 1/24/20 0
3 1/25/20 0
4 1/26/20 0
In [49]:
trace0=go.Scatter(
        x=ts_confirmed_india['dates'],
        y=ts_confirmed_india['confirmed'],
        mode='lines',
        name='Total num of cases')

trace1=go.Scatter(
        x=ts_recovered_india['dates'],
        y=ts_recovered_india['recovered'],
        mode='lines',
        name='Total recoveries')

trace2=go.Scatter(
        x=ts_death_india['dates'],
        y=ts_death_india['deaths'],
        mode='lines',
        name='Total deaths')

trace3=go.Scatter(
        x=ts_active_india['dates'],
        y=ts_active_india['active'],
        mode='lines',
        name='Total active cases')

Good rate of recovery

Even though cases are increasing significantly but we have managed to flatten active cases curve and we are having good number of recoveries while supressing the death curve

In [50]:
data=[trace0,trace1,trace2,trace3]
layout=go.Layout(title='Corona Virus scenario in India')
fig=go.Figure(data,layout)
fig.show()
In [51]:
fig = px.bar(ts_confirmed_india, x="dates", y='confirmed', title="Daily confirmed cases")
fig.show()

Analyising situation in various states across India

In [52]:
#Now I will Analyze India's data state wise
india_state=pd.read_csv('https://raw.githubusercontent.com/imdevskp/covid-19-india-data/master/state_level_latest.csv')
In [53]:
india_state.head()
Out[53]:
State Confirmed Recovered Deaths Active Last_Updated_Time Migrated_Other State_code Delta_Confirmed Delta_Recovered Delta_Deaths State_Notes
0 Total 473643 271688 14907 186993 25/06/2020 10:35:13 55 TT 658 0 0 NaN
1 Maharashtra 142899 73792 6739 62353 24/06/2020 23:16:15 15 MH 0 0 0 15 cases were marked as non-covid deaths in MH...
2 Tamil Nadu 67468 37763 866 28839 24/06/2020 19:27:14 0 TN 0 0 0 2 deaths cross notified to other states from C...
3 Delhi 70390 41437 2365 26588 24/06/2020 18:26:15 0 DL 0 0 0 Delhi bulletins in the morning, containing dat...
4 Gujarat 29001 21096 1736 6169 24/06/2020 21:35:26 0 GJ 0 0 0 NaN
In [54]:
india_state.drop({'Last_Updated_Time','Migrated_Other','Delta_Confirmed','Delta_Recovered','Delta_Deaths','State_Notes'},
                axis=1,inplace=True)
In [55]:
india_state.drop(india_state.index[[0,9]],inplace=True)
india_state=india_state.reset_index(drop=True)
india_state.style.bar(subset=['Confirmed','Recovered','Deaths','Active'],color='red')
Out[55]:
State Confirmed Recovered Deaths Active State_code
0 Maharashtra 142899 73792 6739 62353 MH
1 Tamil Nadu 67468 37763 866 28839 TN
2 Delhi 70390 41437 2365 26588 DL
3 Gujarat 29001 21096 1736 6169 GJ
4 Uttar Pradesh 19557 12586 596 6375 UP
5 Rajasthan 16009 12611 375 3023 RJ
6 West Bengal 15173 9702 591 4880 WB
7 Madhya Pradesh 12448 9473 534 2441 MP
8 State Unassigned 8493 0 0 8493 UN
9 Karnataka 10118 6153 164 3797 KA
10 Andhra Pradesh 10331 4779 129 5423 AP
11 Bihar 8273 6106 55 2112 BR
12 Telangana 10444 4361 225 5858 TG
13 Jammu and Kashmir 6422 3818 88 2516 JK
14 Assam 6371 3959 9 2400 AS
15 Odisha 5962 4123 24 1815 OR
16 Punjab 4627 3099 113 1415 PB
17 Kerala 3604 1888 23 1691 KL
18 Uttarakhand 2623 1721 35 850 UT
19 Chhattisgarh 2419 1601 12 806 CT
20 Jharkhand 2219 1575 12 632 JH
21 Tripura 1263 904 1 358 TR
22 Ladakh 932 274 1 657 LA
23 Goa 951 289 2 660 GA
24 Himachal Pradesh 806 466 7 320 HP
25 Manipur 970 328 0 642 MN
26 Chandigarh 420 322 6 92 CH
27 Puducherry 461 176 9 276 PY
28 Nagaland 355 148 0 207 NL
29 Mizoram 145 24 0 121 MZ
30 Arunachal Pradesh 160 38 1 121 AR
31 Sikkim 83 39 0 44 SK
32 Dadra and Nagar Haveli and Daman and Diu 134 30 0 103 DN
33 Andaman and Nicobar Islands 56 41 0 15 AN
34 Meghalaya 46 41 1 4 ML
35 Lakshadweep 0 0 0 0 LD

States with most number of cases

In [56]:
#lets create a bar chart of top 5 States comparing the number of confirmed cases
df_bar=india_state.loc[0:4,['State','Confirmed']]
In [57]:
df_bar.set_index('State',drop=True,inplace=True)
In [58]:
df_bar.plot(kind='bar',figsize=(10,6),rot=90,color='orange')
plt.xlabel('States')
plt.ylabel('Total Confirmed Cases')
plt.title('Top 5 States with Most number of Confirmed Cases')

plt.show()

States with most number of recoveries

Here we see that though Tamil Nadu has more number of cases delhi is leading in number of recoveries

In [59]:
#state with max recoveries
df_bar_recovered=india_state.loc[0:4,['State','Recovered']]
df_bar_recovered.set_index('State',drop=True,inplace=True)
df_bar_recovered.sort_values('Recovered',axis=0,ascending=False,inplace=True)
In [60]:
df_bar_recovered.plot(kind='bar',figsize=(10,6),rot=90,color='green')
plt.xlabel('states')
plt.ylabel('Number of recoveries')
plt.title('States with Most recoveries')
plt.show()

States with most number of deaths

Here we see that Tamil Nadu has managed to contain deaths even though having second most number of cases

In [61]:
#State with Max deaths
df_bar_death=india_state.loc[0:4,['State','Deaths']]
df_bar_death.set_index('State',drop=True,inplace=True)
df_bar_death.sort_values('Deaths',axis=0,ascending=False,inplace=True)
In [62]:
df_bar_death.plot(kind='bar',figsize=(10,6),rot=90,color='red')
plt.xlabel('states')
plt.ylabel('Number of deaths')
plt.title('States with Most Deaths')
plt.show()
#Tamil Nadu despite having Second largest cases is on 4th number in deaths

Here we wil analyse the state with most number of cases per million

In [63]:
import xlsxwriter
path=r'C:\Users\Anubhav\Untitled Folder\\Indian States Population and Area.xlsx'
df = pd.ExcelFile(path)
In [64]:
df.sheet_names
Out[64]:
['Sheet1']
In [65]:
df_pop = df.parse("Sheet1")
In [66]:
df_pop.head()
#I will use this data since we can't Rely on data of 2011 Census and this gives pretty basic Idea
Out[66]:
State Aadhaar assigned as of 2019 Area (per sq km)
0 Delhi 21763471 1483
1 Haryana 28941133 44212
2 Kerala 36475649 38852
3 Himachal Pradesh 7560770 55673
4 Punjab 30355185 50362
In [67]:
df_state_pop=pd.merge(india_state,df_pop,on='State')
In [68]:
df_state_pop.rename(columns={'Aadhaar assigned as of 2019':'Population'},inplace=True)
In [69]:
df_state_pop.head()
Out[69]:
State Confirmed Recovered Deaths Active State_code Population Area (per sq km)
0 Maharashtra 142899 73792 6739 62353 MH 114063427 307713
1 Tamil Nadu 67468 37763 866 28839 TN 72344821 130060
2 Delhi 70390 41437 2365 26588 DL 21763471 1483
3 Gujarat 29001 21096 1736 6169 GJ 62097024 196244
4 Uttar Pradesh 19557 12586 596 6375 UP 203757489 240928
In [70]:
df_state_pop['Cases/million']=(df_state_pop['Confirmed']/df_state_pop['Population'])*1000000
df_state_pop.sort_values('Cases/million',axis=0,ascending=False,inplace=True)
df_state_pop.reset_index(drop=True,inplace=True)
df_state_pop.drop({'Area (per sq km)'},axis=1,inplace=True)
df_state_pop.head()
Out[70]:
State Confirmed Recovered Deaths Active State_code Population Cases/million
0 Ladakh 932 274 1 657 LA 210156 4434.800815
1 Delhi 70390 41437 2365 26588 DL 21763471 3234.318643
2 Maharashtra 142899 73792 6739 62353 MH 114063427 1252.802969
3 Assam 6371 3959 9 2400 AS 6755448 943.090673
4 Tamil Nadu 67468 37763 866 28839 TN 72344821 932.589217
In [71]:
#We will visualize 5 states with most cases/million
df_bar_pop=df_state_pop.loc[0:4,['State','Cases/million']]
df_bar_pop.set_index('State',inplace=True)

Here we see an interesting thing that compared to its population Delhi has a lot more cases as compared to Maharasthra

In [72]:
df_bar_pop.plot(kind='bar',figsize=(10,6))
plt.xlabel('States')
plt.ylabel('Cases/million')
plt.title('States with most cases/million')
plt.show()
In [73]:
import seaborn as sns
plt.figure(figsize = (12,8))
sns.heatmap(df_state_pop.corr(), annot=True)
plt.show()
In [74]:
covid_daily=pd.read_csv('https://raw.githubusercontent.com/imdevskp/covid-19-india-data/master/state_level_daily.csv')
In [75]:
covid_daily.head()
Out[75]:
Unnamed: 0 Date State Confirmed Deceased Recovered State_Name
0 0 01-Apr-20 AN 0 0 0 Andaman and Nicobar Islands
1 1 01-Apr-20 AP 67 0 1 Andhra Pradesh
2 2 01-Apr-20 AR 0 0 0 Arunachal Pradesh
3 3 01-Apr-20 AS 15 0 0 Assam
4 4 01-Apr-20 BR 3 0 0 Bihar

We will here Analyse which state among top 6 has managed to flatten the curve

In [76]:
#Now we will see top 6 states and which were able to flatten the curve
covid_maha=covid_daily[covid_daily['State']=='MH']
covid_maha=covid_maha.drop({'Deceased','Recovered','Unnamed: 0'},axis=1)
covid_maha['Date'] =pd.to_datetime(covid_maha.Date)
covid_maha.sort_values(by='Date',ascending=True,inplace=True)
covid_maha.reset_index(drop=True,inplace=True)
covid_maha.head()
Out[76]:
Date State Confirmed State_Name
0 2020-03-14 MH 14 Maharashtra
1 2020-03-15 MH 18 Maharashtra
2 2020-03-16 MH 6 Maharashtra
3 2020-03-17 MH 3 Maharashtra
4 2020-03-18 MH 3 Maharashtra
In [77]:
covid_del=covid_daily[covid_daily['State']=='DL']
covid_del=covid_del.drop({'Deceased','Recovered','Unnamed: 0'},axis=1)
covid_del['Date'] =pd.to_datetime(covid_del.Date)
covid_del.sort_values(by='Date',ascending=True,inplace=True)
covid_del.reset_index(drop=True,inplace=True)
covid_del.head()
Out[77]:
Date State Confirmed State_Name
0 2020-03-14 DL 7 Delhi
1 2020-03-15 DL 0 Delhi
2 2020-03-16 DL 0 Delhi
3 2020-03-17 DL 1 Delhi
4 2020-03-18 DL 2 Delhi
In [78]:
covid_guj=covid_daily[covid_daily['State']=='GJ']
covid_guj=covid_guj.drop({'Deceased','Recovered','Unnamed: 0'},axis=1)
covid_guj['Date'] =pd.to_datetime(covid_guj.Date)
covid_guj.sort_values(by='Date',ascending=True,inplace=True)
covid_guj.reset_index(drop=True,inplace=True)
covid_guj.head()
Out[78]:
Date State Confirmed State_Name
0 2020-03-14 GJ 0 Gujarat
1 2020-03-15 GJ 0 Gujarat
2 2020-03-16 GJ 0 Gujarat
3 2020-03-17 GJ 0 Gujarat
4 2020-03-18 GJ 0 Gujarat
In [79]:
covid_tn=covid_daily[covid_daily['State']=='TN']
covid_tn=covid_tn.drop({'Deceased','Recovered','Unnamed: 0'},axis=1)
covid_tn['Date'] =pd.to_datetime(covid_tn.Date)
covid_tn.sort_values(by='Date',ascending=True,inplace=True)
covid_tn.reset_index(drop=True,inplace=True)
covid_tn.head()
Out[79]:
Date State Confirmed State_Name
0 2020-03-14 TN 1 Tamil Nadu
1 2020-03-15 TN 0 Tamil Nadu
2 2020-03-16 TN 0 Tamil Nadu
3 2020-03-17 TN 0 Tamil Nadu
4 2020-03-18 TN 1 Tamil Nadu
In [80]:
covid_up=covid_daily[covid_daily['State']=='UP']
covid_up=covid_up.drop({'Deceased','Recovered','Unnamed: 0'},axis=1)
covid_up['Date'] =pd.to_datetime(covid_up.Date)
covid_up.sort_values(by='Date',ascending=True,inplace=True)
covid_up.reset_index(drop=True,inplace=True)
covid_up.head()
Out[80]:
Date State Confirmed State_Name
0 2020-03-14 UP 12 Uttar Pradesh
1 2020-03-15 UP 1 Uttar Pradesh
2 2020-03-16 UP 0 Uttar Pradesh
3 2020-03-17 UP 2 Uttar Pradesh
4 2020-03-18 UP 2 Uttar Pradesh
In [81]:
covid_rj=covid_daily[covid_daily['State']=='RJ']
covid_rj=covid_rj.drop({'Deceased','Recovered','Unnamed: 0'},axis=1)
covid_rj['Date'] =pd.to_datetime(covid_rj.Date)
covid_rj.sort_values(by='Date',ascending=True,inplace=True)
covid_rj.reset_index(drop=True,inplace=True)
covid_rj.head()
Out[81]:
Date State Confirmed State_Name
0 2020-03-14 RJ 3 Rajasthan
1 2020-03-15 RJ 1 Rajasthan
2 2020-03-16 RJ 0 Rajasthan
3 2020-03-17 RJ 0 Rajasthan
4 2020-03-18 RJ 3 Rajasthan
In [82]:
trace0=go.Scatter(
        x=covid_maha['Date'],
        y=covid_maha['Confirmed'],
        mode='lines',
        name='Cases in Maharashtra')

trace1=go.Scatter(
        x=covid_del['Date'],
        y=covid_del['Confirmed'],
        mode='lines',
        name='Cases in Delhi')

trace2=go.Scatter(
        x=covid_tn['Date'],
        y=covid_tn['Confirmed'],
        mode='lines',
        name='Cases in Tamil Nadu')

trace3=go.Scatter(
        x=covid_up['Date'],
        y=covid_up['Confirmed'],
        mode='lines',
        name='Cases in Uttar Pradesh')

trace4=go.Scatter(
        x=covid_rj['Date'],
        y=covid_rj['Confirmed'],
        mode='lines',
        name='Cases in Rajasthan')

trace5=go.Scatter(
        x=covid_guj['Date'],
        y=covid_guj['Confirmed'],
        mode='lines',
        name='Cases in Gujrat')

Here we see that Rajasthan , Gujrat and Uttar Pradesh have almost managed to flatten their curve despite having a lot cases in beginning of pandemic

In [83]:
data=[trace0,trace1,trace2,trace3,trace4,trace5]
layout=go.Layout(title='Which States are flattening Curve')
fig=go.Figure(data,layout)
fig.show()

Analysing Medical scenario in various states

In [84]:
# Now we will see which States are ready to face pandemic 
df_health=pd.read_csv(r'C:\Users\Anubhav\Untitled Folder\datasets_557629_1259476_HospitalBedsIndia.csv')
df_health.head()
Out[84]:
Sno State/UT NumPrimaryHealthCenters_HMIS NumCommunityHealthCenters_HMIS NumSubDistrictHospitals_HMIS NumDistrictHospitals_HMIS TotalPublicHealthFacilities_HMIS NumPublicBeds_HMIS NumRuralHospitals_NHP18 NumRuralBeds_NHP18 NumUrbanHospitals_NHP18 NumUrbanBeds_NHP18
0 1 Andaman & Nicobar Islands 27 4 NaN 3 34 1246 27 575 3 500
1 2 Andhra Pradesh 1417 198 31.0 20 1666 60799 193 6480 65 16658
2 3 Arunachal Pradesh 122 62 NaN 15 199 2320 208 2136 10 268
3 4 Assam 1007 166 14.0 33 1220 19115 1176 10944 50 6198
4 5 Bihar 2007 63 33.0 43 2146 17796 930 6083 103 5936
In [85]:
df_health.columns
Out[85]:
Index(['Sno', 'State/UT', 'NumPrimaryHealthCenters_HMIS',
       'NumCommunityHealthCenters_HMIS', 'NumSubDistrictHospitals_HMIS',
       'NumDistrictHospitals_HMIS', 'TotalPublicHealthFacilities_HMIS',
       'NumPublicBeds_HMIS', 'NumRuralHospitals_NHP18', 'NumRuralBeds_NHP18',
       'NumUrbanHospitals_NHP18', 'NumUrbanBeds_NHP18'],
      dtype='object')
In [86]:
df_health=df_health.drop({'Sno','NumPrimaryHealthCenters_HMIS','NumCommunityHealthCenters_HMIS','NumSubDistrictHospitals_HMIS',
               'NumDistrictHospitals_HMIS','TotalPublicHealthFacilities_HMIS','NumRuralHospitals_NHP18',
                         'NumUrbanHospitals_NHP18'},axis=1)
df_health.head()
Out[86]:
State/UT NumPublicBeds_HMIS NumRuralBeds_NHP18 NumUrbanBeds_NHP18
0 Andaman & Nicobar Islands 1246 575 500
1 Andhra Pradesh 60799 6480 16658
2 Arunachal Pradesh 2320 2136 268
3 Assam 19115 10944 6198
4 Bihar 17796 6083 5936
In [87]:
 df_health['Total Beds'] = df_health.loc[:,'NumPublicBeds_HMIS'].add(df_health.loc[:,'NumRuralBeds_NHP18']).add(df_health.loc[:,'NumUrbanBeds_NHP18'])
In [88]:
df_health.drop({'NumPublicBeds_HMIS','NumRuralBeds_NHP18','NumUrbanBeds_NHP18'},axis=1,inplace=True)
In [89]:
df_health=df_health.rename(columns={'State/UT':'State'})
df_health.head()
Out[89]:
State Total Beds
0 Andaman & Nicobar Islands 2321
1 Andhra Pradesh 83937
2 Arunachal Pradesh 4724
3 Assam 36257
4 Bihar 29815
In [90]:
df_state_health=pd.merge(df_state_pop,df_health,on='State')
df_state_health.head()
Out[90]:
State Confirmed Recovered Deaths Active State_code Population Cases/million Total Beds
0 Delhi 70390 41437 2365 26588 DL 21763471 3234.318643 44955
1 Maharashtra 142899 73792 6739 62353 MH 114063427 1252.802969 120444
2 Assam 6371 3959 9 2400 AS 6755448 943.090673 36257
3 Tamil Nadu 67468 37763 866 28839 TN 72344821 932.589217 150148
4 Goa 951 289 2 660 GA 1587012 599.239325 5679
In [91]:
#We can see that our top 5 affected states are not even in top 10 of beds/million index 
df_state_health['beds/million']=(df_state_health['Total Beds']/df_state_health['Population'])*1000000
df_state_health.sort_values('beds/million',axis=0,ascending=False,inplace=True)
df_state_health.reset_index(drop=True,inplace=True)
df_state_health
Out[91]:
State Confirmed Recovered Deaths Active State_code Population Cases/million Total Beds beds/million
0 Meghalaya 46 41 1 4 ML 978281 47.021255 9042 9242.743138
1 Lakshadweep 0 0 0 0 LD 70374 0.000000 550 7815.386364
2 Puducherry 461 176 9 276 PY 1286189 358.423218 8031 6244.027899
3 Assam 6371 3959 9 2400 AS 6755448 943.090673 36257 5367.075581
4 Sikkim 83 39 0 44 SK 578914 143.371900 2705 4672.542036
5 Chandigarh 420 322 6 92 CH 1131522 371.181471 4534 4006.992352
6 Mizoram 145 24 0 121 MZ 1089577 133.079167 4309 3954.745741
7 Arunachal Pradesh 160 38 1 121 AR 1229964 130.085108 4724 3840.762819
8 Goa 951 289 2 660 GA 1587012 599.239325 5679 3578.422847
9 Nagaland 355 148 0 207 NL 1262729 281.137124 3824 3028.361588
10 Himachal Pradesh 806 466 7 320 HP 7560770 106.602899 21105 2791.382359
11 Tripura 1263 904 1 358 TR 3672693 343.889348 9312 2535.469205
12 Kerala 3604 1888 23 1691 KL 36475649 98.805644 77515 2125.116403
13 Tamil Nadu 67468 37763 866 28839 TN 72344821 932.589217 150148 2075.449188
14 Delhi 70390 41437 2365 26588 DL 21763471 3234.318643 44955 2065.617199
15 Karnataka 10118 6153 164 3797 KA 62462743 161.984561 126498 2025.175231
16 Andhra Pradesh 10331 4779 129 5423 AP 49145456 210.212720 83937 1707.930027
17 Manipur 970 328 0 642 MN 2515724 385.574888 3989 1585.627040
18 West Bengal 15173 9702 591 4880 WB 91928327 165.052498 129729 1411.197225
19 Uttarakhand 2623 1721 35 850 UT 11082791 236.673235 15172 1368.969242
20 Rajasthan 16009 12611 375 3023 RJ 68936999 232.226529 83692 1214.036021
21 Gujarat 29001 21096 1736 6169 GJ 62097024 467.027212 73409 1182.166153
22 Maharashtra 142899 73792 6739 62353 MH 114063427 1252.802969 120444 1055.938816
23 Punjab 4627 3099 113 1415 PB 30355185 152.428654 31460 1036.396253
24 Madhya Pradesh 12448 9473 534 2441 MP 74770270 166.483283 66979 895.797220
25 Chhattisgarh 2419 1601 12 806 CT 27414422 88.238227 23766 866.915961
26 Odisha 5962 4123 24 1815 OR 42825628 139.215705 35016 817.641250
27 Uttar Pradesh 19557 12586 596 6375 UP 203757489 95.981748 134570 660.441983
28 Jharkhand 2219 1575 12 632 JH 34607477 64.119092 18188 525.551169
29 Bihar 8273 6106 55 2112 BR 102714667 80.543512 29815 290.270133
In [92]:
df_bar_beds=df_state_health.loc[0:5,['State','beds/million']]
df_bar_beds.set_index('State',drop=True,inplace=True)
df_bar_beds.head()
Out[92]:
beds/million
State
Meghalaya 9242.743138
Lakshadweep 7815.386364
Puducherry 6244.027899
Assam 5367.075581
Sikkim 4672.542036

Not even one state from top 5 states having most number of cases is in top 10 states of most beds/ million which tells us that we were not ready to face this pandemic

In [93]:
df_bar_beds.plot(kind='bar',figsize=(10,6))
plt.xlabel('States')
plt.ylabel('beds/million')
plt.title('States with most beds per million')
plt.show()
In [ ]: